We assumed: * k-5 is elementary school * 6-8 is middle school * 9-12 is high school Calculations were made with this in mind. We are aware that some schools are only k-2, so students will be zoned differently for grade 3, but we are interested in the zone of kindergarten/lowest grade a students entered the system.
grades <- c(0, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12)
schools <-c("em", "em", "em" , "em" , "em", "em", "mid" , "mid", "mid", "hs", "hs", "hs", "hs")
map_grade <- data.frame(grades, schools)
K is 0, all other grades before K are dropped because students are not zoned for school until Kindergarten.
all_bios <- bios
#bios_noNA <- all_bios%>%
# filter(!is.na(student_id_scram)) #only one student id is na
bios_dbn <- all_bios %>%
select(student_id_scram, dbn, year, grade_level)
bios_dbn <- bios_dbn %>% mutate(grade_level = ifelse(grade_level == "0K", "00", grade_level))
bios_dbn <- bios_dbn %>% mutate(grade_level = as.integer(grade_level)) %>% filter(!is.na(grade_level))
## Warning in evalq(as.integer(grade_level), <environment>): NAs introduced by
## coercion
#join data so you see students
bios_dbn <- bios_dbn %>% full_join(map_grade, by = c("grade_level" = "grades"))
################################ MOST IMPORTANT ###############################################
test<- bios_dbn %>% group_by(student_id_scram, schools, year, dbn) %>% summarise()
#only keep entry to each school level for each student
bio_distinct_schools <- test %>% group_by(student_id_scram, schools) %>% filter(year == min(year))
####################################################################################################
#view
#bios_dbn %>% group_by(student_id_scram, dbn, schools) %>% summarise()%>% arrange(student_id_scram) %>% View()
smaller_data <- bios_dbn %>% group_by(student_id_scram, dbn, schools) %>% summarise()
unique_data <- smaller_data %>% group_by(student_id_scram, schools) %>%
distinct(student_id_scram, schools, .keep_all = TRUE)
#it works!
# unique_data %>%
# group_by(student_id_scram) %>%
# summarise(count =n())%>%
# group_by(count) %>%
# summarise(countx = n())
#spread this now to get distance from zone to school. IMPORTANT: We only took one mid, high, em for each student.
#Some students changed schools, and we would not see this from this table.
#################################################################
bios_spread <- unique_data %>% spread(schools, dbn)
##################################################################
#display bio_distinct_schools
bio_distinct_schools
## # A tibble: 3,794,610 x 4
## # Groups: student_id_scram, schools [3,794,610]
## student_id_scram schools year dbn
## <int> <fctr> <dbl> <chr>
## 1 2160 em 2005 20K185
## 2 2160 hs 2010 02M475
## 3 2160 mid 2007 20K187
## 4 2240 hs 2005 08X650
## 5 2259 em 2013 21K188
## 6 2320 em 2005 04M050
## 7 2320 hs 2010 02M655
## 8 2320 mid 2007 04M050
## 9 2339 em 2005 11X078
## 10 2339 hs 2009 08X405
## # ... with 3,794,600 more rows
#############################
#ZONE DATA DOES NOT INCLUDE ANYTHING FROM 2005 or 2006
#############################
#sanity checks are commented
# zoneData %>% select(student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn, year) %>%
# group_by(student_id_scram) %>% summarise(count = n()) %>% group_by(count) %>% summarise(countx = n())
# zoneData %>% select(student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn, year) %>%
# group_by(student_id_scram) %>% summarise(count = n()) %>%
# filter(count == 9) %>% head(1)
#zoneData %>% filter(student_id_scram == 12507)
distinct_zones <- zoneData %>% select(1:8) %>% #used to be callued df_distinct
group_by(student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn, year) %>%
distinct(student_id_scram, schools, .keep_all = TRUE)
#df_distinct %>% group_by(year) %>% summarise()
distinct_zones
## # A tibble: 9,502,347 x 8
## # Groups: student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn,
## # year [9,502,347]
## student_id_scram res_zip_cde zoned_elm_dbn zoned_mid_dbn zoned_hs_dbn
## <int> <chr> <chr> <chr> <chr>
## 1 126202521 10009 01M015 01M973 71M985
## 2 892202621 10463 10X007 10X368 72X981
## 3 173202621 10009 01M063 01M972 71M985
## 4 428202887 10009 01M064 01M973 71M985
## 5 425202898 10002 01M188 01M973 71M985
## 6 452202349 10461 11X108 11X144 11X415
## 7 205202349 10009 01M064 01M972 71M985
## 8 775202349 10009 01M015 01M973 71M985
## 9 730302510 10002 01M974 01M973 71M985
## 10 862302510 10009 01M064 01M973 71M985
## # ... with 9,502,337 more rows, and 3 more variables: census_block <chr>,
## # census_tract <chr>, year <dbl>
We joined the data on DBNs from the zoned data and the DBN’s from the bio data on each elementary, middle and high school. It is important to note that zone data only begins from 2007. For students who began elementary, middle or high school before 2007, we do NOT have the zoned dbn. Therefore, naturally these students are not included on maps of attended school vs zoned school. For all other counts, these students are included.
Data Quirks: Students are zoned to an elementary, middle and high school every time he/she is zoned. We had to detirmine the year a student entered elementary school, for example, and then find that year for that student in the zoned data to detirmine the zone that student was mapped to the year he/she entered elementary school. Zones change quite frequently and this granular data was important for the numbers later on.
# reformat df to wide format
bio_distinct_schools <- bio_distinct_schools %>% spread(schools, dbn)
#join the data frames. All records that do not have a match are dropped. This ensures that the map will only include data that exists
join_zone_bio <- inner_join(bio_distinct_schools, distinct_zones, by = c("year", "student_id_scram"))
join_zone_bio
## # A tibble: 2,247,484 x 11
## # Groups: student_id_scram [?]
## student_id_scram year em hs mid res_zip_cde zoned_elm_dbn
## <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2160 2007 <NA> <NA> 20K187 11209 20K185
## 2 2160 2010 <NA> 02M475 <NA> 11209 20K185
## 3 2259 2013 21K188 <NA> <NA> 11224 21K188
## 4 2320 2007 <NA> <NA> 04M050 10029 04M050
## 5 2320 2010 <NA> 02M655 <NA> 10029 04M050
## 6 2339 2007 <NA> <NA> 11X144 10469 11X078
## 7 2339 2009 <NA> 08X405 <NA> 10469 11X111
## 8 2400 2013 30Q002 <NA> <NA> 11370 30Q002
## 9 2551 2012 02M151 <NA> <NA> 10128 02M151
## 10 2612 2007 <NA> <NA> 21K303 11235 21K100
## # ... with 2,247,474 more rows, and 4 more variables: zoned_mid_dbn <chr>,
## # zoned_hs_dbn <chr>, census_block <chr>, census_tract <chr>
join_zone_bio_matches <- join_zone_bio
join_zone_bio_matches <- join_zone_bio_matches%>%
mutate(matchEM = ifelse(!is.na(zoned_elm_dbn), ifelse(zoned_elm_dbn == em, 1, 0), -1))
join_zone_bio_matches <- join_zone_bio_matches%>%
mutate(matchMID = ifelse(!is.na(zoned_mid_dbn), ifelse(zoned_mid_dbn == mid, 1, 0), -1))
join_zone_bio_matches <- join_zone_bio_matches%>%
mutate(matchHS = ifelse(!is.na(zoned_hs_dbn), ifelse(zoned_hs_dbn == em, 1, 0), -1))
add a collumn for boro * note that some DBNs are not in district-boro-n format.
# some dbns don't have a 0 boro and the dbn is in not in standard dbn format
# 69,974 have a zoned boro of zero, 168,331 have a standard zoned boro
em_data <- join_zone_bio_matches %>%filter(!is.na(em)) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3))
em_data
## # A tibble: 815,633 x 15
## # Groups: student_id_scram [815,633]
## student_id_scram year em hs mid res_zip_cde zoned_elm_dbn
## <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2259 2013 21K188 <NA> <NA> 11224 21K188
## 2 2400 2013 30Q002 <NA> <NA> 11370 30Q002
## 3 2551 2012 02M151 <NA> <NA> 10128 02M151
## 4 12003 2008 75X186 <NA> <NA> 10467 11X103
## 5 12116 2009 09X449 <NA> <NA> 10452 09X114
## 6 12507 2007 17K167 <NA> <NA> 11213 17K398
## 7 12772 2007 24Q071 <NA> <NA> 11385 24Q071
## 8 22002 2009 17K191 <NA> <NA> 11233 17K191
## 9 22016 2010 06M004 <NA> <NA> 10032 06M004
## 10 22059 2010 18K233 <NA> <NA> 11212 18K219
## # ... with 815,623 more rows, and 8 more variables: zoned_mid_dbn <chr>,
## # zoned_hs_dbn <chr>, census_block <chr>, census_tract <chr>,
## # matchEM <dbl>, matchMID <dbl>, matchHS <dbl>, zoned_boro <chr>
#%>%
#group_by(zoned_boro) %>%
# summarise(count = n())
# number of students zoned to a school (includes all years of data), we don't know how many seats each school has
em_data %>% group_by(zoned_elm_dbn) %>% summarise(num_zoned_to_dbn = n())
## # A tibble: 888 x 2
## zoned_elm_dbn num_zoned_to_dbn
## <chr> <int>
## 1 00678 1
## 2 00921 1
## 3 00976 1
## 4 01M015 41
## 5 01M019 56
## 6 01M020 42
## 7 01M034 36
## 8 01M063 62
## 9 01M064 91
## 10 01M110 47
## # ... with 878 more rows
# number of students who went to a specific elementary school
local_zones_data <- full_join( (em_data %>% group_by(zoned_elm_dbn) %>% summarise(zoned_to_dbn = n())),
(em_data %>% group_by(em) %>% summarise(went_to_dbn = n())),
by = c("zoned_elm_dbn" = "em") )
local_zones_data <- full_join( local_zones_data,
(join_zone_bio_matches %>% filter(matchEM == 1) %>% group_by(zoned_elm_dbn) %>%
summarise(attending_zoned_dbn = n())), by = "zoned_elm_dbn")
local_zones_data <- local_zones_data %>% mutate(perc_local = (attending_zoned_dbn/went_to_dbn)*100)
local_zones_data %>% mutate(boro = substr(zoned_elm_dbn, 3, 3)) %>% filter(!is.na(attending_zoned_dbn)) %>% group_by(boro) %>% summarize(sum_attended_zoned=sum(attending_zoned_dbn), sum_zoned= sum(zoned_to_dbn)) %>%
mutate(perc_boro_att_zoned = (sum_attended_zoned/sum_zoned)*100)
## # A tibble: 5 x 4
## boro sum_attended_zoned sum_zoned perc_boro_att_zoned
## <chr> <int> <int> <dbl>
## 1 K 141682 227647 62.23759
## 2 M 47835 81812 58.46942
## 3 Q 156370 206773 75.62399
## 4 R 34133 43639 78.21673
## 5 X 100669 155324 64.81226
local_zones_data
## # A tibble: 1,313 x 5
## zoned_elm_dbn zoned_to_dbn went_to_dbn attending_zoned_dbn perc_local
## <chr> <int> <int> <int> <dbl>
## 1 00678 1 NA NA NA
## 2 00921 1 NA NA NA
## 3 00976 1 NA NA NA
## 4 01M015 41 309 5 1.618123
## 5 01M019 56 468 10 2.136752
## 6 01M020 42 951 21 2.208202
## 7 01M034 36 323 8 2.476780
## 8 01M063 62 299 11 3.678930
## 9 01M064 91 412 21 5.097087
## 10 01M110 47 614 13 2.117264
## # ... with 1,303 more rows
{r} # # # some dbns don't have a 0 boro and the dbn is in not in standard dbn format # match_df %>% filter(match_dbn == 0) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3)) #%>% # group_by(zoned_boro) %>% summarise(count = n()) # # # 69974 have a zoned boro of zero, 168331 have a standard zoned boro # match_df %>% filter(match_dbn == 0) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3)) %>% # mutate(to_boro = substr(BioDbn, 3, 3)) %>% group_by(zoned_boro == 0) %>% summarise(count = n()) # # # count of students who went to # match_df %>% filter(match_dbn == 0) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3)) %>% # mutate(to_boro = substr(BioDbn, 3, 3)) # # # number of students zoned to a school # match_df %>% group_by(zoned_elm_dbn) %>% summarise(zoned_to_dbn = n()) # # # number of studebts who went to a specific school # local_zones_data <- full_join( (match_df %>% group_by(zoned_elm_dbn) %>% summarise(zoned_to_dbn = n())), # (match_df %>% group_by(BioDbn) %>% summarise(went_to_dbn = n())), # by = c("zoned_elm_dbn" = "BioDbn") ) # # local_zones_data <- full_join( local_zones_data, # (match_df %>% filter(match_dbn == 1) %>% group_by(zoned_elm_dbn) %>% # summarise(attending_zoned_dbn = n())), # by = "zoned_elm_dbn") # # local_zones_data <- local_zones_data %>% mutate(perc_local = (attending_zoned_dbn/went_to_dbn)*100) # local_zones_data %>% mutate(boro = substr(zoned_elm_dbn, 3, 3)) %>% filter(!is.na(attending_zoned_dbn)) %>% group_by(boro) %>% summarize(sum_attended_zoned=sum(attending_zoned_dbn), sum_zoned= sum(zoned_to_dbn)) %>% # mutate(perc_boro_att_zoned = (sum_attended_zoned/sum_zoned)*100) # # # local_zones_data ##get map dbns data nyc open data
r<- GET("https://data.cityofnewyork.us/api/geospatial/cq6p-iwiy?method=export&format=GeoJSON")
dbns <- readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
## No encoding supplied: defaulting to UTF-8.
## Warning in readOGR(content(r, "text"), "OGRGeoJSON", verbose = F): Dropping
## null geometries: 752
#dbns@polygons
#dbns@polygons[[1]]@Polygons[[1]]@coords[1,]
#dbns@polygons[[2]]@Polygons[[1]]@coords[1,]
#as.data.frame()
df_spatial_dbns <- as.data.frame(dbns@data$dbn)
# df_lat_long
# for (i in 1:2) {
# df_lat_long[i] <- list(dbns@polygons[[i]]@Polygons[[1]]@coords[1,])
# }
#loop through polygons to get lat longs
df_lat_long <- vector(mode = "list", length = 779)
for (i in 1:779) {
df_lat_long[[i]] <- dbns@polygons[[i]]@Polygons[[1]]@coords[1,]
}
lat_long <- do.call(rbind, df_lat_long)
dbns_latlong_bind <- cbind(df_spatial_dbns, lat_long)
colnames(dbns_latlong_bind)[1] <- "dbn"
colnames(dbns_latlong_bind)[2] <- "lat"
colnames(dbns_latlong_bind)[3] <- "long"
#elem_bio <- df %>% filter(!is.na(zoned_elm_dbn))
#map_data <- merge(, by.x = "dbn", by.y = "")
leaflet(dbns) %>%
addTiles() %>%
addPolygons(popup = ~paste("School DBN:", dbn))%>%
addProviderTiles("CartoDB.Positron")%>%
setView(-73.98, 40.75, zoom = 13)
# Map of percent of students in each zone who attended the school he/she was zoned to
map_data <- merge(dbns, local_zones_data, by.x = "dbn", by.y ="zoned_elm_dbn")
# how many students went to their local school
leaflet(map_data) %>%
addTiles() %>%
addPolygons(popup = ~paste("Percent Local:", round(perc_local),"%"))%>%
addProviderTiles("CartoDB.Positron")%>%
setView(-73.98, 40.75, zoom = 13)
#addCircles(~lng, ~lat, popup=ct$type, weight = 3, radius=40,
# color="#ffa500", stroke = TRUE, fillOpacity = 0.8)
# leaflet(map_data) %>%
# addTiles() %>%
# addWebGLHeatmap()%>%
# addPolygons(popup = ~paste("Percent Local:", round(perc_local),"%"))%>%
# addProviderTiles("CartoDB.Positron")%>%
# setView(-73.98, 40.75, zoom = 13)
#
# leaflet(map_data) %>%
# addProviderTiles(providers$CartoDB) %>%
# setView(-73.98, 40.75, zoom = 13 ) %>%
# addHeatmap(~ , ~, intensity = ~dbn,
# blur = 20, max = 0.05, radius = 15)
pal <- colorNumeric(palette = "Blues",
domain = range(map_data@data$perc_local, na.rm=T))
leaflet(map_data) %>%
#addTiles() %>%
addPolygons(fillColor = ~pal(perc_local), popup = ~perc_local, weight = 0) %>%
addProviderTiles("CartoDB.Positron") %>%
setView(-73.98, 40.75, zoom = 13)